00-03 TCGA差异分析

xena TCGA的counts数据做差异分析

语法

#### ####:R中的目录,大标题
##:标题
#:注释
#@:阶段性成果
#!:值得注意
#?:存疑

代码

##设置工作目录
setwd("xena")

##安装包
install.packages("tidyverse")
##使用包,每次重新打开R都要library一下 或者右下勾选
library(tidyverse)

#xena数据库官网 包括TCGA https://xenabrowser.net/datapages/

##文件的读取和处理
#读取tsv文件
counts1 = read.table(file = 'TCGA-LIHC.htseq_counts.tsv', sep = '\t', header = TRUE)
#为新表x赋值1到3列
#x <- counts1[,1:3]
#改行名
rownames(counts1) <- counts1[,1] #Alt按-可打出<-
#只去除第一列
counts1 = counts1[,-1]
#介绍substr函数,选取字符串的1到4位
#substr("wanglihong",1,4)
#table函数,分组计算
table(substr(colnames(counts1),14,16))
#集合c,含两个值
#c("01A","11A")
#%in%符号用于判断是否属于,筛选出属于的
counts1 <- counts1[,substr(colnames(counts1),14,16)%in% c("01A","11A")]
#再table一次,看是否筛选出来了
table(substr(colnames(counts1),14,16))
#保留行名前15位
rownames(counts1) <- substr(rownames(counts1),1,15)
#ceiling函数,取大于目标值的最小整数
#ceiling(1.2)
#对counts1作数据转换,还原到原数据(原网页显示表中数据是原数据转化而来)
counts <- ceiling(2^(counts1)-1)

##文件的输出
#输出为文本
write.table(counts,"counts.txt",sep = "\t",row.names = T,col.names = NA,quote = F)
#@输出为表格,到这一步由counts1得到原数据counts的表
write.csv(counts, file = "counts.csv")

##24.12.14 03 TCGA差异分析
#设置工作目录
setwd("xena")
#读取之前txt格式的counts
counts <- read.table("counts.txt",sep = "\t",row.names = 1,check.names = F,stringsAsFactors = F,header = T)
#加载基因注释文件
Ginfo_0 <- read.table("gene_length_Table.txt",sep = "\t",check.names = F,stringsAsFactors = F,header = T,row.names = 1)
#美元符号代表提取列,Ginfo_0genetypeGinfo<Ginfo0[which(Ginfo0genetype == "protein_coding"),] #只要编码RNA
#注意两个等号
#!我发现27行其实可以更精确从列选取

#取行名交集,获得一组行名的值,命名为comgene
comgene <- intersect(rownames(counts),rownames(Ginfo))
#选出counts中这些行名的行,即编码RNA的基因的行
counts <- counts[comgene,]
#判断数据类型,如"data.frame"是数据框
#class(counts)
#class(comgene) "character"字符
#?up说这里为已经是19627行的Ginfo赋值是为了使行顺序和comgene一致,我认为确实能使顺序一致,但我怀疑在此处本来顺序就是一致的。因为取交集之前两者的行都是按顺序排列。
Ginfo <- Ginfo[comgene,]
#判断两者行名是否完全相同(包括顺序)
a <- rownames(counts)
b <- rownames(Ginfo)
identical(a,b)

#把Ginfo里的列作为字符给counts新增一列为Gene。新增Gene Symbol
countsGene<as.character(Ginfogenename)
#?去掉Gene重复的行,为什么基因名字重复而基因id不重复呢,!应该表示非。当然,后面既然使用基因名作为行名,自然保险起见应该以基因名去重复
counts <- counts[!duplicated(countsGene),] #将行名变为Gene Symbol rownames(counts) <- countsGene
#计算有多少列
ncol(Ginfo)
#计算有多少行
nrow
#去除最后一列,列的总数即最后一列
counts <- counts[,-ncol(counts)]
#@输出筛选后的肝癌的编码RNA的基因的表达量的文件,到这一步counts加工了行名为genename
write.table(counts, file = "LIHC_counts_mRNA_all.txt",sep = "\t",row.names = T,col.names = NA,quote = F)

#保存counts中01A即癌症患者的列名,其实跟27行一样,但是只有一个值,不需要集合,从而可以用双等于也不需要用%in%
#但有一个很大的不同,27行是表格中的列赋值到另一个表,这里是列名赋值到一组字符串
#?为什么这里的格式是这个呢,方括号一般是表啊,难道表示判断也可以。哦,也许是一组字符串的第几个也可以用方括号。
tumor <- colnames(counts)[substr(colnames(counts),14,16) == "01A"]
counts_01A <- counts[,tumor]
write.table(counts_01A, file = "LIHC_counts_mRNA_01A.txt",sep = "\t",row.names = T,col.names = NA,quote = F)

#差异分析
library(tidyverse)
#安装BiocManager
library(BiocManager)
#通过BiocManager安装DESeq2,Update all/some/none? [a/s/n]: n 似乎跟tidyverse不一样只用来安装
ifinstall('DESeq2'
library(DESeq2)

#不知道
counts = counts[apply(counts, 1, function(x) sum(x > 1) > 32), ]
#分组,是tumor与normal相比
conditions=data.frame(sample=colnames(counts),
group=factor(ifelse(substr(colnames(counts),14,16) == "01A","T","N"),levels = c("N","T"))) %>%
column_to_rownames("sample")
#不知道
dds <- DESeqDataSetFromMatrix(
countData = counts,
colData = conditions,
design = ~ group)
#跑差异分析
dds <- DESeq(dds)
#"Intercept" "group_T_vs_N"
resultsNames(dds)
#提取结果res
res <- results(dds)
#一定要保存!保存之后以后可以直接用 DEG:differentially expressed genes 差异性表达的基因
#读取差异基因文件 直接在文件夹双击
save(res,file = "LIHC_DEG.rda")
#p值根据自己需要,abs是绝对值,大于0的很多,大于3的会少一点,表示基因在肿瘤中是正常的三倍以上
res_deseq2 <- as.data.frame(res)%>%
arrange(padj) %>%
dplyr::filter(abs(log2FoldChange) > 3, padj < 0.05)

笔记补充

setwd("xena")#set work directory

ctrl f#查找

install.packages("tidyverse")

library(tidyverse)#每次重新打开R都要library一下

TCGA / 癌症简称 / 缩写 / TCGA癌症中英文对照

Abbr 英文名称 中文名称
ACC Adrenocortical carcinoma 肾上腺皮质癌
BLCA Bladder Urothelial Carcinoma 膀胱尿路上皮癌
BRCA Breast invasive carcinoma 乳腺浸润癌
CESC Cervical squamous cell carcinoma and endocervical adenocarcinoma 宫颈鳞癌和腺癌
CHOL Cholangiocarcinoma 胆管癌
COAD Colon adenocarcinoma 结肠癌
COADREAD Colon adenocarcinoma/Rectum adenocarcinoma Esophageal carcinoma 结直肠癌
DLBC Lymphoid Neoplasm Diffuse Large B-cell Lymphoma 弥漫性大B细胞淋巴瘤
ESCA Esophageal carcinoma 食管癌
FPPP FFPE Pilot Phase II FFPE试点二期
GBM Glioblastoma multiforme 多形成性胶质细胞瘤
GBMLGG Glioma 胶质瘤
HNSC Head and Neck squamous cell carcinoma 头颈鳞状细胞癌
KICH Kidney Chromophobe 肾嫌色细胞癌
KIPAN Pan-kidney cohort (KICH+KIRC+KIRP) 混合肾癌
KIRC Kidney renal clear cell carcinoma 肾透明细胞癌
KIRP Kidney renal papillary cell carcinoma 肾乳头状细胞癌
LAML Acute Myeloid Leukemia 急性髓细胞样白血病
LGG Brain Lower Grade Glioma 脑低级别胶质瘤
LIHC Liver hepatocellular carcinoma 肝细胞肝癌
LUAD Lung adenocarcinoma 肺腺癌
LUSC Lung squamous cell carcinoma 肺鳞癌
MESO Mesothelioma 间皮瘤
OV Ovarian serous cystadenocarcinoma 卵巢浆液性囊腺癌
PAAD Pancreatic adenocarcinoma 胰腺癌
PCPG Pheochromocytoma and Paraganglioma 嗜铬细胞瘤和副神经节瘤
PRAD Prostate adenocarcinoma 前列腺癌
READ Rectum adenocarcinoma 直肠腺癌
SARC Sarcoma 肉瘤
SKCM Skin Cutaneous Melanoma 皮肤黑色素瘤
STAD Stomach adenocarcinoma 胃癌
STES Stomach and Esophageal carcinoma 胃和食管癌
TGCT Testicular Germ Cell Tumors 睾丸癌
THCA Thyroid carcinoma 甲状腺癌
THYM Thymoma 胸腺癌
UCEC Uterine Corpus Endometrial Carcinoma 子宫内膜癌
UCS Uterine Carcinosarcoma 子宫肉瘤
UVM Uveal Melanoma 葡萄膜黑色素瘤

UCSC Xena

counts1 = read.table(file = 'TCGA-LIHC.htseq_counts.tsv', sep = '\t', header = TRUE)#读取tsv文件
counts1 = read.table(file = ''TCGA-LIHC.htseq_counts.tsv'', sep = '\t', header = TRUE)#读取tsv 文件

a == b ,这是数学的等于,a = b是赋值

ENSG00000000003.13
基因编号

TCGA中样品取样组织类型编号对照包 sample type

01A 肿瘤
11A 正常

Sample Type Codes | NCI Genomic Data Commons

01 Primary Solid Tumor TP(实体瘤)
02 Recurrent Solid Tumor TR
03 Primary Blood Derived Cancer - Peripheral Blood TB (血液相关癌)
04 Recurrent Blood Derived Cancer - Bone Marrow TRBM
05 Additional - New Primary TAP
06 Metastatic TM
07 Additional Metastatic TAM
08 Human Tumor Original Cells THOC
09 Primary Blood Derived Cancer - Bone Marrow TBM
10 Blood Derived Normal NB (血液癌,正常)
11 Solid Tissue Normal NT(实体瘤,正常)
12 Buccal Cell Normal NBC
13 EBV Immortalized Normal NEBV
14 Bone Marrow Normal NBM
15 sample type 15 15SH
16 sample type 16 16SH
20 Control Analyte CELLC
40 Recurrent Blood Derived Cancer - Peripheral Blood TRB
50 Cell Lines CELL
60 Primary Xenograft Tissue XP
61 Cell Line Derived Xenograft Tissue XCL
99 sample type 99 99SH

alt 再按 - ,在R里可以打出<-,类似于等号

改行名 改列的位置

rownames 行名
colnames 列名

table函数 分组运算

筛选列

c("01A","11A"),把两者放到一个集合里

UCSC Xena
unit =log2(count+1)
count=2^counts -1